
load(file = paste("data/interim/", poll, "/master_file_", poll,".RData", sep=""))
load(file = paste("data/interim/", poll, "/data_annual_", poll,".RData", sep=""))
load(file = paste("data/interim/", poll, "/data_daily_comb_", poll,".RData", sep=""))

assign(paste("data_daily", poll, sep="_"), get(paste("data_daily_comb",poll, sep="_")) %>% distinct(GEOID10,Site.Num,
                                                                                                    POC, year, Date.Local) %>%
         semi_join(get(paste("data_annual",poll, sep="_"))  %>%
                     distinct(GEOID10, Site.Num, POC, year),
                   by = c("GEOID10", "Site.Num", "POC",
                          "year")) %>% mutate(TrueDate = as.Date(Date.Local)))

# Distinct Monitors Info
monitors_distinct <- get(paste("data_master", poll, sep="_")) %>% distinct(GEOID10, Site.Num, POC)

# Start Simulation
# seed the process
set.seed(1234567890)

# get random warnings
sample_event <- function(n, m, ind){
  sample_from = 1:m
  
  remove <- NULL
  for (i in 1:length(ind)){remove <- dplyr::union(remove, seq(ind[i] - 30, ind[i] + 30, by = 1))}
  sample_from <- dplyr::setdiff(sample_from, remove)
  x <- NULL
  
  for (i in 1:n){
    x[i] = sample(sample_from, 1) 
    
    sample_from = dplyr::setdiff(sample_from, seq(x[i]-1, x[i]+1))
  }
  return(x)
}

# First, filter master 
emp_p <- function(data){
  coef_all <- NULL
  coef_all_2 <- NULL
  monitors_distinct <- data %>% distinct(GEOID10, Site.Num, POC)
  for (i in 1:dim(monitors_distinct)[1]){
    dat <- get(paste("data_master", poll, sep="_")) %>% semi_join(monitors_distinct[i,], by=c("GEOID10", "Site.Num", "POC")) %>%
      dplyr::mutate(year=as.factor(lubridate::year(TrueDate)), 
                    month=as.factor(lubridate::month(TrueDate)), day=as.factor(weekdays(TrueDate)))
    warning <- dat %>% dplyr::filter(eday==0)
    year_info <- get(paste("data_annual",poll, sep="_")) %>% semi_join(monitors_distinct[i,], by=c("GEOID10", "Site.Num", "POC"))
    alldates <- NULL
    # get the date sequence
    for (j in 1:dim(year_info)[1]){
      alldates <- c(alldates, seq(as.Date(paste(year_info[j,7], '-01-01', sep='')), 
                                  as.Date(paste(year_info[j,7], '-12-31', sep='')), 
                                  by = '+1 day'))
    }
    # Add year, month info
    alldates <- cbind(alldates, month <- lubridate::month(as.Date(alldates, origin="1970-01-01")), 
                      year <- lubridate::year(as.Date(alldates, origin="1970-01-01"))) %>% as.data.frame() %>%
      dplyr::rename(month=2, year=3) 
    
    # filter those exist in daily X monitor data
    alldates <- alldates %>% 
      inner_join(get(paste("data_daily_comb", poll, sep="_")) %>% inner_join(monitors_distinct[i,], by=c("GEOID10", "Site.Num", "POC")) %>% 
                   dplyr::mutate(month=lubridate::month(TrueDate)) %>% distinct(GEOID10, Site.Num, POC, year, month), by=c("year", "month"))
    
    ind <- which(alldates[,1] %in% warning$TrueDate)
    len <- dim(alldates)[1]
    # Repeat sampling for 1000 times
    event <- try(map(seq_len(1000), ~sample_event(dim(warning)[1],len, ind)), silent = TRUE)
    if ('try-error' %in% class(event)) {coef_all <- coef_all
    coef_all_2 <- coef_all_2} else {
      events <- unlist(event, use.names = FALSE)
      alldates <- alldates[,1]
      event_df <- matrix(alldates[events], byrow = FALSE, nrow=dim(warning)[1]) %>% as.data.frame() 
      event_exp <- event_df %>% as.data.frame() %>% group_by_all() %>%
        tidyr::expand(eday = seq(-30, 30, by = 1))
      
      dat_monitor <- get(paste("data_daily", poll, sep="_")) %>% semi_join(monitors_distinct[i,], by=c("GEOID10", "Site.Num", "POC"))
      coefficients <- NULL
      coefficients_2 <- NULL
      
      for (k in 1:1000) {
        reg_dat <- event_exp[c(k,1001)] %>% dplyr::rename(ForecastDate = 1) %>% 
          dplyr::mutate(ForecastDate=as.Date(ForecastDate, origin="1970-01-01")) %>%
          dplyr::mutate(TrueDate = ForecastDate + eday)%>% 
          left_join(dat_monitor, by= c("TrueDate")) %>% 
          dplyr::mutate(d_missing=as.numeric(is.na(GEOID10))) %>% 
          dplyr::select(ForecastDate, GEOID10, eday, TrueDate, d_missing) %>% 
          dplyr::mutate(eday=as.factor(eday), year=as.factor(lubridate::year(TrueDate)), 
                        month=as.factor(lubridate::month(TrueDate)), day=as.factor(weekdays(TrueDate)))
             
        model <- lm(-d_missing ~ 0 + eday, data=reg_dat)
        coef <- rep(0,61)
        
        for(l in 1:length(model$coefficients)){
          if(str_sub(names(model$coefficients)[l],1,4)=="eday" & str_length(names(model$coefficients)[l])==7) {
            coef[31 + as.numeric(str_sub(names(model$coefficients)[l],-3,-1))] <- model$coefficients[l]} 
          else if(str_sub(names(model$coefficients)[l],1,4)=="eday" & str_length(names(model$coefficients)[l])==5) {
            coef[31 + as.numeric(str_sub(names(model$coefficients)[l],-1,-1))] <- model$coefficients[l] } 
          else if(str_sub(names(model$coefficients)[l],1,4)=="eday" & str_length(names(model$coefficients)[l])==6) {
            coef[31 + as.numeric(str_sub(names(model$coefficients)[l],-2,-1))] <- model$coefficients[l]} 
        }
        
        coefficients <- cbind(coefficients, coef)
      }
      
      model_true <- lm(-d_missing ~ 0 + eday, data=dat)
      coef_true_1 <- rep(0,61)
      for(l in 1:length(model_true$coefficients)){
        if(str_sub(names(model_true$coefficients)[l],1,4)=="eday" & str_length(names(model_true$coefficients)[l])==7) {
          coef_true_1[31 + as.numeric(str_sub(names(model_true$coefficients)[l],-3,-1))] <- model_true$coefficients[l]} 
        else if(str_sub(names(model_true$coefficients)[l],1,4)=="eday" & str_length(names(model_true$coefficients)[l])==5) {
          coef_true_1[31 + as.numeric(str_sub(names(model_true$coefficients)[l],-1,-1))] <- model_true$coefficients[l] } 
        else if(str_sub(names(model_true$coefficients)[l],1,4)=="eday" & str_length(names(model_true$coefficients)[l])==6) {
          coef_true_1[31 + as.numeric(str_sub(names(model_true$coefficients)[l],-2,-1))] <- model_true$coefficients[l]} 
      }
      
      coefficients <- cbind(coefficients, coef_true_1, rep(monitors_distinct[i,],61))
      coef_all <- rbind(coef_all, coefficients)
      }}
  return(coef_all)
}

emp <- emp_p(get(paste("data_master", poll, sep="_"))) 

result <- emp %>% clean_names()
save(result, file = paste("data/interim/", poll, "/result_",poll, "_.RData", sep=""))